/* sHT -- find lexeme boundaries
   Copyright (C) 2019 Ariadne Devos

   This program is free software: you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation, either version 3 of the License, or
   (at your option) any later version.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program.  If not, see <http://www.gnu.org/licenses/>. */

#ifndef _sHT_LEX_H
#define _sHT_LEX_H

#include <stddef.h>
#include <stdint.h>

/** Detecting lexeme boundaries

  This module detects lexeme boundaries for variable-length
  strings of bytes belonging to a certain class, followed by
  a terminator byte.

  TODO: 'ignore' variant. */

/** Accumulates bytes

  It is to be interpreted in the context of the lexing parameters, named
  @var{c} below. It is disjoint from @var{c}.

  NOTE(review): the original text called the type of @var{c}
  @var{sHT_lex_state}, which this header does not declare. The field
  @code{c->max_known} referenced below is a member of
  @var{struct sHT_lex_type}, so presumably that type is meant.

  As @var{bytes} is a flexible array member, the storage behind it has to
  be provided by whoever allocates the structure. */
struct sHT_lex_buf
{
	/** The number of accumulated bytes, therefore, the number
	  of meaningful bytes in @var{bytes}. (R/W, not accessed concurrently)
	  (less than @code{c->max_known}) */
	uint16_t offset;
	/** Some accumulated bytes (R/W, not accessed concurrently).
	  Its capacity is @code{c->max_known} and its length @var{offset};
	  only the first @var{offset} bytes are meaningful. */
	unsigned char bytes[];
};

/** Lexing parameters

  Describes what a lexeme looks like (@var{c_allow}, @var{c_stop},
  @var{max_known}) and which callbacks handle each outcome.

  The lexeme boundary and length detection may speculatively be incorrect.
  All fields are readable and read-only. */
struct sHT_lex_type
{
	/** Lex a string into its syntactical element

	  At least, that's the common use case.
	  Index 0 is for syntax errors, index 1 for well-formed strings.
	  For the former, @var{n} is the index of the syntax error.

	  NOTE(review): the original text said that, for well-formed
	  strings, ``@var{n} is the index of the offending byte''. Going by
	  the description of @var{n} below, that is presumably the index of
	  the terminating @var{c_stop} byte -- confirm against the
	  implementation.

	  NOTE(review): @var{str} is not const-qualified here, although the
	  second argument of @var{sHT_lex} is; confirm whether callbacks
	  may write through it.

	  @var{to}: the first argument passed to @var{sHT_lex}
	  @var{str}: the readable string to lex, not modified concurrently,
	    non-speculatively including the terminating @var{c_stop} byte.
	    Either @var{to->bytes} or the second argument of @var{sHT_lex}.
	  @var{n}: the length of the string to lex,
	    non-speculatively excluding the terminator
	    (not greater than @var{max_known}).
	  @var{ret}: the number of bytes @var{sHT_lex} parsed */
	size_t (* cb_value[2])(struct sHT_lex_buf *to, unsigned char *str, size_t n, size_t ret);
	/** The lexeme is longer than any known

	  @var{to}: the first argument passed to @var{sHT_lex}
	  @var{ret}: the number of bytes @var{sHT_lex} parsed */
	size_t (* cb_ignore)(struct sHT_lex_buf *to, size_t ret);

	/** The lexeme is longer than any known, but it has been parsed

	  Index 0 is for syntax errors, index 1 for well-formed
	  strings. The terminator or syntax error is included in @var{ret}.

	  @var{ret}: the number of bytes @var{sHT_lex_skip} parsed
	  @var{c}: the @var{x} argument passed to @var{sHT_lex_skip}, which
	    is its last argument (the original text named this parameter
	    @var{x} and called it the first argument) */
	size_t (* cb_skip_done[2])(size_t ret, void *c);

	/** A byte class represented by a readable bitvector, not modified
	  concurrently (probably not at all), indexed by the byte to test
	  for its well-formedness. If set, the byte is within the set,
	  otherwise, it isn't. */
	const unsigned char *c_allow;
	/** The maximal length of any known lexeme, including the terminating
	  @var{c_stop} byte (positive, < 2**15; therefore, less than
	  @var{SSIZE_MAX}, as @var{size_t} must be at least a @var{uint16_t}) */
	uint_least16_t max_known;
	/** The terminator byte. Does not belong to @var{c_allow}. */
	unsigned char c_stop;
};
/* Lengths bounded by a uint_least16_t (such as max_known) are handed around
   as size_t, so size_t must be able to represent every uint_least16_t. */
_Static_assert(SIZE_MAX >= UINT_LEAST16_MAX, "size_t is too small!");

/** Find the lexeme boundary of a scattered string

  @var{to}: a buffer to accumulate bytes to
  @var{from}: a string to take bytes from, readable, not modified concurrently
  @var{n}: the length of @var{from} (positive, less than @var{SSIZE_MAX})
  @var{c}: what a lexeme looks like, and which callback to call when

  @var{from} is disjoint from @var{to} and @var{c}.
  If not calling into @var{c}, return the number of parsed bytes, including
  the terminator, if any. The first time, @code{to->offset} must be set to
  zero (@var{offset} is a field of @var{to}; @var{c} has no such field).

  Speculatively, the boundaries and syntax error detection may be incorrect. */
size_t
sHT_lex(struct sHT_lex_buf *to, const unsigned char from[], size_t n, const struct sHT_lex_type *c);

/** Skip some bytes of @var{from}

  @var{from}: a readable buffer, not modified concurrently, to ignore
  @var{n}: the length of @var{from} (positive, less than @var{SSIZE_MAX})
  @var{c}: what a lexeme looks like, and which callback to call when
  @var{x}: ignored by this function itself, may be used by @var{c}
    callbacks (see @code{c->cb_skip_done})

  This function does not modify anything, except for what the functions it
  tail-calls do. @code{c->cb_skip_done} may be tail-called. Otherwise,
  return the number of parsed bytes. The syntax and terminator detection
  may speculatively be incorrect. */
size_t
sHT_lex_skip(const unsigned char from[], size_t n, const struct sHT_lex_type *c, void *x);

#endif

